Purpose:

Calculate and plot mutational signatures for all samples using COSMIC signatures and Alexandrov et al, 2013 mutational signatures.

Summary of Findings:

Coming soon.

Usage

To run this from the command line, use:

Rscript -e "rmarkdown::render('analyses/mutational-signatures/mutational_signatures.Rmd', 
                              clean = TRUE)"

This assumes you are in the top directory of the repository.

Setup

Packages and functions

Import necessary functions.

if (!("deconstructSigs" %in% installed.packages())) {
  BiocManager::install("BSgenome.Hsapiens.UCSC.hg38")
  install.packages("deconstructSigs")
}
if (!("devtools" %in% installed.packages())) {
  install.packages("devtools")
}
if (!("sigfit" %in% installed.packages())) {
  devtools::install_github("kgori/sigfit", build_vignettes = TRUE,
    build_opts = c("--no-resave-data", "--no-manual"))
}
# Magrittr pipe
`%>%` <- dplyr::`%>%`

# Import specialized functions
source(file.path("util", "mut_sig_functions.R"))

# Load this library
library(deconstructSigs)
library(sigfit)

Set up directory paths.

data_dir <- file.path("..", "..", "data")
results_dir <- "results"
plots_dir <- "plots"
cosmic_plots <- file.path(plots_dir, "cosmic")
nature_plots <- file.path(plots_dir, "nature")
denovo_plots <- file.path(plots_dir, "denovo")

Make new directories for the results.

if (!dir.exists(results_dir)) {
  dir.create(results_dir)
}
if (!dir.exists(cosmic_plots)) {
  dir.create(cosmic_plots, recursive = TRUE)
}
if (!dir.exists(nature_plots)) {
  dir.create(nature_plots, recursive = TRUE)
}

Read in data

Read in the consensus MAF file.

# Declare file path for consensus file
consensus_file <- file.path(data_dir, "pbta-snv-consensus-mutation.maf.tsv.gz")

Read in the consensus MAF file.

# Read in the file
maf <- data.table::fread(consensus_file, data.table = FALSE)

Read in the metadata.

metadata_df <- readr::read_tsv(file.path(data_dir, "pbta-histologies.tsv")) %>% 
  dplyr::rename(Tumor_Sample_Barcode = "Kids_First_Biospecimen_ID") %>%
  dplyr::select("Tumor_Sample_Barcode", "experimental_strategy", "short_histology")
## Parsed with column specification:
## cols(
##   .default = col_character(),
##   OS_days = col_double(),
##   age_last_update_days = col_double(),
##   normal_fraction = col_double(),
##   tumor_fraction = col_double(),
##   tumor_ploidy = col_double(),
##   molecular_subtype = col_logical()
## )
## See spec(...) for full column specifications.
## Warning: 493 parsing failures.
##  row               col           expected actual                              file
## 2334 molecular_subtype 1/0/T/F/TRUE/FALSE Group3 '../../data/pbta-histologies.tsv'
## 2335 molecular_subtype 1/0/T/F/TRUE/FALSE Group4 '../../data/pbta-histologies.tsv'
## 2336 molecular_subtype 1/0/T/F/TRUE/FALSE Group3 '../../data/pbta-histologies.tsv'
## 2337 molecular_subtype 1/0/T/F/TRUE/FALSE Group3 '../../data/pbta-histologies.tsv'
## 2338 molecular_subtype 1/0/T/F/TRUE/FALSE Group3 '../../data/pbta-histologies.tsv'
## .... ................. .................. ...... .................................
## See problems(...) for more details.

Read in this list so we can make sure we keep only primary tumors for the grouped bar plots.

ind_samples <- readr::read_tsv(file.path(
  data_dir,
  "independent-specimens.wgswxs.primary.tsv"
))
## Parsed with column specification:
## cols(
##   Kids_First_Participant_ID = col_character(),
##   Kids_First_Biospecimen_ID = col_character()
## )

Read in the WGS and WXS regions so they can be used for the Mb denominator.

# Set up BED region files for TMB calculations
wgs_bed <- readr::read_tsv(file.path(data_dir, "WGS.hg38.strelka2.unpadded.bed"),
  col_names = FALSE
)
## Parsed with column specification:
## cols(
##   X1 = col_character(),
##   X2 = col_double(),
##   X3 = col_double()
## )
wxs_bed <- readr::read_tsv(file.path(data_dir, "WXS.hg38.100bp_padded.bed"),
  col_names = FALSE
)
## Parsed with column specification:
## cols(
##   X1 = col_character(),
##   X2 = col_double(),
##   X3 = col_double()
## )
# Calculate size of genome surveyed
# These files are BED files where the third column is the End position and
# the second column is the Start position.
# So End - Start gives the size of each range. Sum the gives the total size in bp.
wgs_size <- sum(wgs_bed[, 3] - wgs_bed[, 2])
wxs_size <- sum(wxs_bed[, 3] - wxs_bed[, 2])

Set up data

Determine how many mutations we have per sample.

mut_per_sample <- maf %>%
  dplyr::group_by(Tumor_Sample_Barcode) %>%
  dplyr::tally() %>%
  dplyr::arrange(n)

summary(mut_per_sample$n)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       1      81     255    1309     802  139938

Graph this.

ggplot2::ggplot(mut_per_sample, ggplot2::aes(x = n, geom = "density")) +
  ggplot2::geom_density() +
  ggplot2::theme_classic()

Make mutation data into deconstructSigs input format.

# Convert to deconstructSigs input
sigs_input <- mut.to.sigs.input(
  mut.ref = maf,
  sample.id = "Tumor_Sample_Barcode",
  chr = "Chromosome",
  pos = "Start_Position",
  ref = "Reference_Allele",
  alt = "Allele",
  bsg = BSgenome.Hsapiens.UCSC.hg38::BSgenome.Hsapiens.UCSC.hg38
)
## Warning: multiple methods tables found for 'type'
## Warning: multiple methods tables found for 'splitAsList'
## Warning: replacing previous import 'BiocGenerics::type' by 'DelayedArray::type'
## when loading 'SummarizedExperiment'
## Warning: replacing previous import
## 'GenomicRanges::.__C__GenomicRanges_OR_GenomicRangesList' by
## 'rtracklayer::.__C__GenomicRanges_OR_GenomicRangesList' when loading 'BSgenome'
## Warning in mut.to.sigs.input(mut.ref = maf, sample.id = "Tumor_Sample_Barcode", : Some samples have fewer than 50 mutations:
##   BS_4XPPZTGG, BS_3J4T2YYW, BS_541KKASA, BS_5FBN4PPQ, BS_WD7AQV83, BS_YEWX69X4, BS_QBZDQX7A, BS_T8C13KNH, BS_5TT6TT4K, BS_V2MDX7HG, BS_GRECE8Q9, BS_KKWQW01N, BS_MBYEK6HV, BS_78TQS4Y3, BS_8BM1WTGR, BS_M9YZK3SP, BS_QYWGB52C, BS_XDY9BYRK, BS_AR2BH0Z4, BS_WD37M8SD, BS_FBJ516WW, BS_6HE8H9HY, BS_KSHETTQC, BS_9F2J6VS1, BS_FJEBFQHG, BS_AM2TZPNS, BS_QSMFVHSB, BS_P7VR731D, BS_CRKBDAYZ, BS_ZHJFX6C9, BS_CWQ35N2K, BS_SSYNTXDC, BS_CQ8C6X4X, BS_XVNNWN2Z, BS_B558FJXC, BS_N8D9FH6M, BS_DMBNB0EP, BS_6B62YT97, BS_ACTR13BQ, BS_11322HD1, BS_MSGYZ69T, BS_HYGK88B0, BS_VXNT1TDB, BS_WSFB4SA2, BS_06GPPDHZ, BS_74SJ7YV4, BS_ZKFEMJZ8, BS_H83DTMT2, BS_KM878Y43, BS_5Z4XQC9X, BS_H3TYF6GD, BS_TYFD8ZMG, BS_E5B2YZJH, BS_DT92H1MJ, BS_EKR4NSNW, BS_AEEKJ1QK, BS_NMJPX8VQ, BS_1RF75MK2, BS_YZ2YNEJJ, BS_328C4BW5, BS_EDCQ4M0R, BS_W6B48DD5, BS_52ETE050, BS_XPZAD9PP, BS_PZDTK87A, BS_E7HEQZ1K, BS_J037K4BM, BS_41SWNWAG, BS_R922P3XJ, BS_FG649AWK, BS_W50WEJE7, BS_886M7JMG, BS_7KR13R3P, BS_CN7PB0DN, BS_F7KYPE79, BS_DAEM3WRX, BS_WXKCR17Q, BS_B1X2SB4P, BS_YPCTK927, BS_HNE6BPMH, BS_CDKC1E2B, BS_R3WB0PP7, BS_QD22MJJ5, BS_AGAW7M7W, BS_1RTE2KEX, BS_A6YJ3WG2, BS_XWQTX5HH, BS_Q5XKTB1V, BS_402W79TS, BS_QSDKJM8T, BS_WYTDVC0Y, BS_N2JD3VX6, BS_NTXHP7QR, BS_2KMB51YF, BS_C5RDMCCS, BS_AXW0XBH8, BS_SDT1J3A7, BS_6KEYD1S4, BS_6GTKRGNK, BS_8JFMP1T1, BS_EHBH0NYX, BS_S88G7BXH, BS_BSM3ZHW4, BS_Q807ENGY, BS_6Y7FX7QJ, BS_D3XVXF05, BS_RR3DKGKY, BS_9QMVZ4C0, BS_9W6FK6X4, BS_JJNSP29S, BS_BVD7PWP5, BS_MS87PMR7, BS_7Y5E5KQW, BS_2VHFP7M8, BS_MHT5C2AP, BS_W8DA9AXT, BS_7BNQBB26, BS_MWXDJFWW, BS_B6KCJF2B, BS_74TZJTW0, BS_93BV8AY9, BS_93B38HPS, BS_YVFFHJ1Q, BS_6VTQ3W4M, BS_5S8VXASX, BS_SE82EQF8, BS_R1RMKH1B, BS_CFEWVDG1, BS_WJ7GMQTF, BS_YZ0P5E8H, BS_XKXDH0YJ, BS_XCFNEZ8E, BS_N0Z6V2YG, BS_947CK40E, BS_9E6Q11G2, BS_NHMPQ8Q1, BS_SYWRCE3Z, BS_ZP89F1JY, BS_B4DY7ET3, BS_4M0ZMCDC, BS_BS5X4H0Y, BS_94AP61Q5, BS_PDRNE606, BS_C0YDDQKB, BS_KV3XF9WV, BS_2JDVY4F7, BS_6DT506HY, BS_JCWD4DMA, BS_1135HC0V, BS_1YTHM07J, BS_895FJ34T, BS_5CCAC8TZ, BS_AXM96K20, BS_797V83YM, BS_V3ZNXRHK, BS_EQE19CB0, BS_71HPXFE7, BS_ZJT1REVB, BS_CPZZR2R7, BS_M1YHV93N, BS_G61CN2V0, BS_GAA327FP, BS_NS8G0465, BS_8CA214YR, BS_PMSZKZP7, BS_BZWM455T, BS_2HDEZFTW, BS_1RFBH1SP, BS_922YMFYK, BS_8Q8CAY84, BS_6687ZNGS, BS_JJWC13GG, BS_K07KNTFY, BS_9DN4QR6E, BS_2E81A3FT, BS_SCQNN2A2, BS_MCM78YPC, BS_5Y2Q63Y4, BS_AQMKA8NC, BS_2VSN5YC5, BS_969K7ZM1, BS_V3SC6NWH, BS_NDSWS9TS, BS_NEX92KQH, BS_TPX7YY57, BS_5J5VH3X0, BS_N5VYY66W, BS_TX2WGF8K, BS_KPG4WCHS, BS_FWP8ZA4K, BS_HM5N78GV, BS_K049HVGR, BS_K14VJ1E3, BS_FWCKP4X1, BS_89RCN9PF, BS_Z14Z2JMW, BS_Y0DGBKTY, BS_PVZJCXS1, BS_0QEBZX3M, BS_N8WXTFN4, BS_ETTZM63Y, BS_CX6KACX6, BS_9R82A3VT

Add total mutations per sample.

# Count the total number of signature mutations for each sample
total_muts <- apply(sigs_input, 1, sum)

Determine Signatures for COSMIC and Alexandrov et al, 2013

Get list of tumor sample ids.

tumor_sample_ids <- maf %>%
  dplyr::filter(Tumor_Sample_Barcode %in% rownames(sigs_input)) %>%
  dplyr::distinct(Tumor_Sample_Barcode) %>%
  dplyr::pull(Tumor_Sample_Barcode)

Get COSMIC signatures for each sample. This step will take some time.

sample_sigs_cosmic <- lapply(tumor_sample_ids, function(sample_id) {
  # Determine the signatures contributing to the sample
  whichSignatures(
    tumor.ref = sigs_input,
    signatures.ref = signatures.cosmic,
    sample.id = sample_id,
    contexts.needed = TRUE
  )
})
# Bring along the names
names(sample_sigs_cosmic) <- tumor_sample_ids

Get Alexandrov et al, 2013 signatures for each sample.

sample_sigs_nature <- lapply(tumor_sample_ids, function(sample_id) {
  # Determine the signatures contributing to the sample
  whichSignatures(
    tumor.ref = sigs_input,
    signatures.ref = signatures.nature2013,
    sample.id = sample_id,
    contexts.needed = TRUE
  )
})
# Bring along the names
names(sample_sigs_nature) <- tumor_sample_ids

COSMIC signature plots

sample_mut_sig_plot(sample_sigs_cosmic,
  label = "cosmic",
  output_dir = file.path(cosmic_plots, "individual_mutation_sig")
)

Alexandrov et al, 2013 signature plots

sample_mut_sig_plot(sample_sigs_nature,
  label = "nature",
  output_dir = file.path(nature_plots, "individual_mutation_sig")
)

Calculate the mutations per Mb for each signature

Do this for COSMIC mutation signatures.

# Calculate mutations per signature
cosmic_sigs_df <- calc_mut_per_sig(sample_sigs_cosmic,
  muts_per_sample = total_muts,
  wgs_genome_size = wgs_size,
  wxs_exome_size = wxs_size,
  metadata = metadata_df
) 
## Using Tumor_Sample_Barcode, experimental_strategy, short_histology as id variables
# Write this to a file  
readr::write_tsv(cosmic_sigs_df, 
                 file.path(results_dir, "cosmic_signatures_results.tsv"))

# Print out a preview
cosmic_sigs_df

Do this for Alexandrov et al, 2013 mutation signatures.

# Calculate mutations per signature
nature_sigs_df <- calc_mut_per_sig(sample_sigs_nature,
  muts_per_sample = total_muts,
  wgs_genome_size = wgs_size,
  wxs_exome_size = wxs_size,
  metadata = metadata_df
) 
## Using Tumor_Sample_Barcode, experimental_strategy, short_histology as id variables
# Write this to a file  
readr::write_tsv(nature_sigs_df, 
                 file.path(results_dir, "nature_signatures_results.tsv"))

# Print out a preview
nature_sigs_df

Mutation signature bubble matrix by histology groups

bubble_matrix_plot(cosmic_sigs_df, label = "COSMIC Signatures")
## Warning: Removed 558 rows containing missing values (geom_point).

ggplot2::ggsave(file.path(cosmic_plots, paste0("bubble_matrix_cosmic_mutation_sig.png")), width = 30, height = 20, units = "cm")
## Warning: Removed 558 rows containing missing values (geom_point).
bubble_matrix_plot(nature_sigs_df, label = "Alexandrov et al, 2013 signatures")
## Warning: Removed 452 rows containing missing values (geom_point).

ggplot2::ggsave(file.path(nature_plots, paste0("bubble_matrix_nature_mutation_sig.png")), width = 30, height = 20, units = "cm")
## Warning: Removed 452 rows containing missing values (geom_point).

Mutation signature grouped bar plots for each histology group

We will make these plots for primary tumor samples only. Let’s make these for COSMIC mutation signatures first.

# Keep only primary tumors
cosmic_sigs_primary <- cosmic_sigs_df %>%
  dplyr::filter(Tumor_Sample_Barcode %in% ind_samples$Kids_First_Biospecimen_ID)

# Make grouped bar plots
lapply(unique(cosmic_sigs_primary$short_histology),
  grouped_sig_barplot,
  sig_num_df = cosmic_sigs_primary,
  output_dir = file.path(cosmic_plots, "signature_grouped_barplots"), 
  label = "cosmic"
)
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image

Make these plots for Alexandrov et al, 2013 signatures.

# Keep only primary tumors
nature_sigs_primary <- nature_sigs_df %>%
  dplyr::filter(Tumor_Sample_Barcode %in% ind_samples$Kids_First_Biospecimen_ID)

# Make grouped bar plots
lapply(unique(nature_sigs_primary$short_histology),
  grouped_sig_barplot,
  sig_num_df = nature_sigs_primary,
  output_dir = file.path(nature_plots, "signature_grouped_barplots"),
  label = "nature"
)
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image

Determine Signatures de novo

This section is included, because the previous strategies imply that there are mutagenic mechanisms in operation that are unlikely to be a major factor in this dataset, such as smoking and UV light. This is happening due to overfitting problems when fitting existing signatures. De novo signature calling is used to provide an alternative for discussion. It is determined, which signatures can be extracted from the data themselves. These signatures are then fitted back to the known signatures that are used for the fitting approach above.

De novo calling

To prevent the loading of big data files into memory and intense processing, I have muted this part of the script and it loads the preprocessed parameters.

# determine the ideal number of signatures. 
# the choice of 10 signatures is now hard coded, because otherwise it has to be rerun.
# this step requires extensive ressources, the signature file is > 8 GB in size. Therefore a loop is included to load a preprocessed file, if available. 

if(!file.exists("results/sigfit_signatures_10.rdat")){
  
  samples_extr <- extract_signatures(counts = sigs_input,
                                        nsignatures = 5:15,
                                        iter = 1000, 
                                        seed = 42)
  dev.off()
  
  samples_extr10 <- extract_signatures(counts = sigs_input,
                                        nsignatures = 10,
                                        iter = 10000, 
                                        seed = 42)
    
  save(samples_extr10, file="results/sigfit_signatures_10.rdat")
}
if(!file.exists("results/sigfit_signatures_10_signatures.rdat")){
  load("results/sigfit_signatures_10.rdat")
  signatures <- retrieve_pars(samples_extr10,
                            par = "signatures")
  save(signatures,file="results/sigfit_signatures_10_signatures.rdat")
}else{
   load("results/sigfit_signatures_10_signatures.rdat")
}

if(!file.exists("results/sigfit_signatures_10_exposures.rdat")){
  load("results/sigfit_signatures_10.rdat")
  exposures <- retrieve_pars(samples_extr10,
                            par = "exposures")
  save(exposures,file="results/sigfit_signatures_10_exposures.rdat")
}else{
   load("results/sigfit_signatures_10_exposures.rdat")
}

if(!file.exists("results/sigfit_signatures_10_reconstructions.rdat")){
  load("results/sigfit_signatures_10.rdat")
  reconstructions <- retrieve_pars(samples_extr10,
                            par = "reconstructions")
  save(reconstructions,file="results/sigfit_signatures_10_reconstructions.rdat")
}else{
   load("results/sigfit_signatures_10_reconstructions.rdat")
}
  
  recmean <- reconstructions$mean
  rownames(recmean)<- rownames(sigs_input) 
  expmean <- exposures$mean
  explow95 <- exposures$lower_95
  rownames(expmean)<- rownames(sigs_input)
  
  # set exposures to 0, if lower confidence interval hits 1%
  expmean[explow95<0.01]<-0

Plotting signatures

pdf(file.path(denovo_plots, "denovo_signatures.pdf"), width=30, height=8)
par(mar=c(0,0,0,0)+7)
plot_spectrum(signatures)  
dev.off()
## quartz_off_screen 
##                 2

Comparing de novo signatures to published signature lists.
This also concludes cosmic 3, which is not included in the fitting approach. It would also not be rommended to use it in the fitting approach, as especially the now very detailed differentiation of signatures made the problems apparent.

  data("cosmic_signatures_v3")
  CNS_sigs <- read.csv("results/Cancer_Substitution_Signatures_CNS.csv")
    rownames(CNS_sigs)<-CNS_sigs[,1]
  CNS_sigs <- CNS_sigs[,-1]
  colnames(CNS_sigs) <- colnames(signatures.cosmic)
  sigmeans<-signatures$mean
  
pdf(file.path(denovo_plots, "CNS_signatures.pdf"), width=30, height=8)
par(mar=c(0,0,0,0)+7)
plot_spectrum(CNS_sigs)  
dev.off()
## quartz_off_screen 
##                 2
pdf(file.path(denovo_plots, "Nature2013_signatures.pdf"), width=30, height=8)
par(mar=c(0,0,0,0)+7)
plot_spectrum(signatures.nature2013)  
dev.off()
## quartz_off_screen 
##                 2
pdf(file.path(denovo_plots, "Cosmic_signatures.pdf"), width=30, height=8)
par(mar=c(0,0,0,0)+7)
plot_spectrum(signatures.cosmic)  
dev.off()
## quartz_off_screen 
##                 2
pdf(file.path(denovo_plots, "Cosmic3_signatures.pdf"), width=30, height=8)
par(mar=c(0,0,0,0)+7)
plot_spectrum(cosmic_signatures_v3)  
dev.off()
## quartz_off_screen 
##                 2

matching with CNS signatures (https://signal.mutationalsignatures.com/)

  library(lsa)
## Loading required package: SnowballC
  library(viridis)
## Loading required package: viridisLite
  library(ggplot2)
  library(cowplot)
## 
## ********************************************************
## Note: As of version 1.0.0, cowplot does not change the
##   default ggplot2 theme anymore. To recover the previous
##   behavior, execute:
##   theme_set(theme_cowplot())
## ********************************************************
  library(reshape2)
  
  
  mat <- matrix(nrow=nrow(sigmeans),ncol=nrow(CNS_sigs))
  for(i in 1:nrow(sigmeans)){
    for(j in 1:nrow(CNS_sigs)){
      mat[i,j]<- cosine(as.numeric(sigmeans[i,]),as.numeric(CNS_sigs[j,]))
      }
    }
  df<- data.frame(mat)
  rownames(df)<- rownames(sigmeans)
  colnames(df)<- rownames(CNS_sigs)
  melted <- melt(df)
## No id variables; using all as measure variables
  colnames(melted) <- c("CNS_sigs","cosine")
  melted$denovo_sigs <- rep(rownames(df),ncol(df))
  
  ggplot(melted, aes(x=denovo_sigs,y=CNS_sigs, fill=cosine))+
   geom_tile()+
    scale_fill_viridis(direction=-1)+
    theme_cowplot()+
    theme(axis.text.x = element_text(angle = 90, hjust = 1))+
    NULL

  ggsave(file.path(denovo_plots, "CNS_signatures_cosine.pdf"), width=10,height=10)
  
 best_match <- data.frame(CNS_sig=apply(df,1,function(x) names(x)[which(x==max(x))]),
                     cosine=apply(df,1,function(x) x[which(x==max(x))]))
knitr::kable(best_match)
CNS_sig cosine
Signature A CNS_E 0.8361563
Signature B CNS_H 0.7454087
Signature C CNS_B 0.5376390
Signature D CNS_D 0.9180451
Signature E CNS_A 0.9680467
Signature F CNS_G 0.6770796
Signature G CNS_F 0.7196000
Signature H CNS_F 0.8286007
Signature I CNS_B 0.9912236
Signature J CNS_C 0.5276796

matching with Cosmic3

mat <- matrix(nrow=nrow(sigmeans),ncol=nrow(cosmic_signatures_v3))
  for(i in 1:nrow(sigmeans)){
    for(j in 1:nrow(cosmic_signatures_v3)){
      mat[i,j]<- cosine(as.numeric(sigmeans[i,]),as.numeric(cosmic_signatures_v3[j,]))
      }
    }
  df<- data.frame(mat)
  rownames(df)<- rownames(sigmeans)
  colnames(df)<- rownames(cosmic_signatures_v3)
  melted <- melt(df)
## No id variables; using all as measure variables
  colnames(melted) <- c("cosmic_signatures_v3","cosine")
  melted$denovo_sigs <- rep(rownames(df),ncol(df))
  
  ggplot(melted, aes(x=denovo_sigs,y=cosmic_signatures_v3, fill=cosine))+
   geom_tile()+
    scale_fill_viridis(direction=-1)+
    theme_cowplot()+
    theme(axis.text.x = element_text(angle = 90, hjust = 1))+
    NULL

  ggsave(file.path(denovo_plots, "cosmic3_signatures_cosine.pdf"), width=10,height=20)
  
 best_match <- data.frame(cosmic_signatures_v3=apply(df,1,function(x) names(x)[which(x==max(x))]),
                     cosine=apply(df,1,function(x) x[which(x==max(x))]))
knitr::kable(best_match)
cosmic_signatures_v3 cosine
Signature A SBS12 0.8749862
Signature B SBS40 0.8162940
Signature C SBS15 0.9508537
Signature D SBS8 0.8653469
Signature E SBS11 0.9559339
Signature F SBS31 0.9236046
Signature G SBS14 0.8201314
Signature H SBS18 0.9501174
Signature I SBS1 0.9912313
Signature J SBS9 0.6257608

Plot exposures

pdf(file.path(denovo_plots, "denovo_exposures.pdf"), width=60, height=15)
par(mar=c(0,0,0,0)+10)
plot_exposures(samples_extr10)
dev.off()

De novo signatures per sample

sample_sigs_denovo <- lapply(tumor_sample_ids, function(sample_id){
  list(
    weights = expmean[sample_id,],
    tumor = sigs_input[sample_id,]/sum(sigs_input[sample_id,]),
    product = recmean[sample_id,]/sum(sigs_input[sample_id,]),
    diff =  expmean[sample_id,]/sum(expmean[sample_id,])-recmean[sample_id,]/sum(recmean[sample_id,])
  )})                     

# Bring along the names
names(sample_sigs_denovo) <- tumor_sample_ids
# Calculate mutations per signature
denovo_sigs_df <- calc_mut_per_sig(sample_sigs_denovo,
  muts_per_sample = total_muts,
  wgs_genome_size = wgs_size,
  wxs_exome_size = wxs_size,
  metadata = metadata_df
) 
## Using Tumor_Sample_Barcode, experimental_strategy, short_histology as id variables
# Write this to a file  
readr::write_tsv(denovo_sigs_df, 
                 file.path(results_dir, "denovo_signatures_results.tsv"))

# Print out a preview
denovo_sigs_df

Bubble plot from de novo signatures

bubble_matrix_plot(denovo_sigs_df, label = "de novo signatures")
## Warning: Removed 86 rows containing missing values (geom_point).

ggplot2::ggsave(file.path(denovo_plots, paste0("bubble_matrix_denovo_mutation_sig.png")), width = 30, height = 20, units = "cm")
## Warning: Removed 86 rows containing missing values (geom_point).

Make plots for denovo signatures.

# Keep only primary tumors
denovo_sigs_primary <- denovo_sigs_df %>%
  dplyr::filter(Tumor_Sample_Barcode %in% ind_samples$Kids_First_Biospecimen_ID)

# Make grouped bar plots
lapply(unique(denovo_sigs_primary$short_histology),
  grouped_sig_barplot,
  sig_num_df = denovo_sigs_primary,
  output_dir = file.path(denovo_plots, "signature_grouped_barplots"),
  label = "denovo"
)
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image

Session Info

sessionInfo()
## R version 3.6.1 (2019-07-05)
## Platform: x86_64-apple-darwin17.7.0 (64-bit)
## Running under: macOS High Sierra 10.13.6
## 
## Matrix products: default
## BLAS/LAPACK: /usr/local/Cellar/openblas/0.3.6_1/lib/libopenblasp-r0.3.6.dylib
## 
## locale:
## [1] en_GB.UTF-8/en_GB.UTF-8/en_GB.UTF-8/C/en_GB.UTF-8/en_GB.UTF-8
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] reshape2_1.4.3        cowplot_1.0.0         ggplot2_3.3.0        
## [4] viridis_0.5.1         viridisLite_0.3.0     lsa_0.73.1           
## [7] SnowballC_0.7.0       sigfit_1.3.2          deconstructSigs_1.8.0
## 
## loaded via a namespace (and not attached):
##  [1] bitops_1.0-6                      matrixStats_0.56.0               
##  [3] RColorBrewer_1.1-2                GenomeInfoDb_1.22.0              
##  [5] rstan_2.19.3                      tools_3.6.1                      
##  [7] R6_2.4.1                          BiocGenerics_0.32.0              
##  [9] colorspace_1.4-1                  withr_2.1.2                      
## [11] tidyselect_1.0.0                  gridExtra_2.3                    
## [13] prettyunits_1.1.1                 processx_3.4.2                   
## [15] compiler_3.6.1                    cli_2.0.2                        
## [17] Biobase_2.44.0                    DelayedArray_0.10.0              
## [19] rtracklayer_1.44.4                labeling_0.3                     
## [21] scales_1.1.0                      readr_1.3.1                      
## [23] callr_3.4.3                       stringr_1.4.0                    
## [25] digest_0.6.25                     Rsamtools_2.2.3                  
## [27] StanHeaders_2.19.2                rmarkdown_2.1                    
## [29] R.utils_2.9.2                     XVector_0.26.0                   
## [31] pkgconfig_2.0.3                   htmltools_0.4.0                  
## [33] highr_0.8                         BSgenome_1.52.0                  
## [35] rlang_0.4.5                       farver_2.0.3                     
## [37] jsonlite_1.6.1                    BiocParallel_1.20.1              
## [39] dplyr_0.8.5                       R.oo_1.23.0                      
## [41] inline_0.3.15                     RCurl_1.98-1.1                   
## [43] magrittr_1.5                      GenomeInfoDbData_1.2.2           
## [45] loo_2.2.0                         Matrix_1.2-18                    
## [47] Rcpp_1.0.4                        munsell_0.5.0                    
## [49] S4Vectors_0.24.3                  fansi_0.4.1                      
## [51] lifecycle_0.2.0                   R.methodsS3_1.8.0                
## [53] stringi_1.4.6                     yaml_2.2.1                       
## [55] SummarizedExperiment_1.14.1       zlibbioc_1.32.0                  
## [57] pkgbuild_1.0.6                    plyr_1.8.6                       
## [59] grid_3.6.1                        parallel_3.6.1                   
## [61] forcats_0.5.0                     crayon_1.3.4                     
## [63] lattice_0.20-40                   Biostrings_2.54.0                
## [65] hms_0.5.3                         BSgenome.Hsapiens.UCSC.hg38_1.4.1
## [67] knitr_1.28                        ps_1.3.2                         
## [69] pillar_1.4.3                      GenomicRanges_1.38.0             
## [71] codetools_0.2-16                  stats4_3.6.1                     
## [73] XML_3.99-0.3                      glue_1.3.2                       
## [75] evaluate_0.14                     data.table_1.12.8                
## [77] vctrs_0.2.4                       gtable_0.3.0                     
## [79] purrr_0.3.3                       clue_0.3-57                      
## [81] assertthat_0.2.1                  xfun_0.12                        
## [83] coda_0.19-3                       tibble_3.0.0                     
## [85] GenomicAlignments_1.20.1          IRanges_2.20.2                   
## [87] cluster_2.1.0                     ellipsis_0.3.0